libraries necessaires

#install.packages("corrplot")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(corrplot)
## corrplot 0.95 loaded
library(magrittr) # syntaxe, notamment affectation %<>%
## 
## Attaching package: 'magrittr'
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
library(GGally)   # plot pairs better than default plot
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(plotly)   # plots interactifs
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(ggplot2)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(flextable)
## 
## Attaching package: 'flextable'
## 
## The following objects are masked from 'package:plotly':
## 
##     highlight, style
## 
## The following object is masked from 'package:purrr':
## 
##     compose
library(tibble)
library(e1071)
library(reshape2)
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(keras)
library(tensorflow)
## 
## Attaching package: 'tensorflow'
## 
## The following object is masked from 'package:caret':
## 
##     train
read_csv("data/heart.csv")
## Rows: 918 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Sex, ChestPainType, RestingECG, ExerciseAngina, ST_Slope
## dbl (7): Age, RestingBP, Cholesterol, FastingBS, MaxHR, Oldpeak, HeartDisease
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 918 × 12
##      Age Sex   ChestPainType RestingBP Cholesterol FastingBS RestingECG MaxHR
##    <dbl> <chr> <chr>             <dbl>       <dbl>     <dbl> <chr>      <dbl>
##  1    40 M     ATA                 140         289         0 Normal       172
##  2    49 F     NAP                 160         180         0 Normal       156
##  3    37 M     ATA                 130         283         0 ST            98
##  4    48 F     ASY                 138         214         0 Normal       108
##  5    54 M     NAP                 150         195         0 Normal       122
##  6    39 M     NAP                 120         339         0 Normal       170
##  7    45 F     ATA                 130         237         0 Normal       170
##  8    54 M     ATA                 110         208         0 Normal       142
##  9    37 M     ASY                 140         207         0 Normal       130
## 10    48 F     ATA                 120         284         0 Normal       120
## # ℹ 908 more rows
## # ℹ 4 more variables: ExerciseAngina <chr>, Oldpeak <dbl>, ST_Slope <chr>,
## #   HeartDisease <dbl>
tb <- read_csv("data/heart.csv") %>%
  mutate_if(is.character, factor)
## Rows: 918 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Sex, ChestPainType, RestingECG, ExerciseAngina, ST_Slope
## dbl (7): Age, RestingBP, Cholesterol, FastingBS, MaxHR, Oldpeak, HeartDisease
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(tb)
## # A tibble: 6 × 12
##     Age Sex   ChestPainType RestingBP Cholesterol FastingBS RestingECG MaxHR
##   <dbl> <fct> <fct>             <dbl>       <dbl>     <dbl> <fct>      <dbl>
## 1    40 M     ATA                 140         289         0 Normal       172
## 2    49 F     NAP                 160         180         0 Normal       156
## 3    37 M     ATA                 130         283         0 ST            98
## 4    48 F     ASY                 138         214         0 Normal       108
## 5    54 M     NAP                 150         195         0 Normal       122
## 6    39 M     NAP                 120         339         0 Normal       170
## # ℹ 4 more variables: ExerciseAngina <fct>, Oldpeak <dbl>, ST_Slope <fct>,
## #   HeartDisease <dbl>
tb$HeartDisease
##   [1] 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 0 1 0 1 1 0 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 1
##  [38] 0 0 0 0 1 0 0 1 0 0 0 0 1 1 1 0 0 0 0 1 1 0 1 0 0 0 1 0 0 0 0 1 0 1 0 1 0
##  [75] 1 0 1 0 0 1 0 0 1 0 1 1 1 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0
## [112] 1 0 0 0 1 1 1 0 1 1 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1 0 0 1 1 1 1 1 0 1 0 0 0
## [149] 0 1 0 0 0 0 0 1 1 0 1 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 1 0 1 0 0
## [186] 1 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 1 1 0 0 1 0 1 0 0 0 1 1
## [223] 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 1 1 1 0 1 1 0 1 0 1 1 1 1 1 1 0 0 1 0 0 0 0
## [260] 0 0 0 1 1 1 0 1 0 1 0 0 0 1 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1
## [297] 1 1 1 1 1 0 1 1 1 1 1 0 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0
## [334] 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1
## [371] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [408] 1 1 1 1 1 1 1 1 1 1 0 1 1 0 0 1 0 1 1 0 1 1 1 1 0 1 1 0 0 1 1 1 0 1 1 1 1
## [445] 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 0 1 1 1 0 1 0 1 0 1 0 1 1 1 1 0 1 0 1 1 1 1
## [482] 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 0 1 1 0 1 0 1 1 0 1 1
## [519] 1 1 0 1 1 1 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 1 1 0 1 0 1 1 0
## [556] 1 0 1 1 1 0 0 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 0 0
## [593] 1 1 1 1 1 0 1 1 0 1 1 1 0 0 1 1 1 1 1 0 1 0 1 1 0 1 0 0 0 1 1 1 1 0 0 0 1
## [630] 0 0 1 1 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 1 1 1 1 1 0 0 1 0 0 0 1 0 1 1 1 1 1
## [667] 0 0 0 0 0 1 0 1 1 0 1 0 0 0 1 0 1 0 1 1 0 0 0 0 1 0 0 0 0 1 1 1 0 0 0 0 0
## [704] 0 1 0 1 1 1 1 1 0 1 0 0 0 1 0 1 1 1 0 1 1 0 1 0 1 0 0 0 1 1 0 1 1 1 1 0 0
## [741] 0 1 0 0 1 1 1 0 1 0 0 0 1 0 0 1 0 1 0 1 1 1 1 1 0 0 0 0 0 0 0 1 0 0 1 1 1
## [778] 0 1 0 0 0 0 0 1 0 1 1 0 0 1 1 1 1 0 0 1 1 0 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0
## [815] 1 0 1 1 1 1 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 1 1 0 1 0 0 1 1 0 0 1 1 0 1 0 1
## [852] 0 1 0 0 1 0 0 1 0 1 1 0 1 1 1 0 1 0 0 0 0 1 1 0 0 1 1 0 1 0 0 0 0 1 0 0 1
## [889] 1 1 0 0 0 1 0 1 0 1 0 1 1 1 0 0 0 1 0 1 1 1 0 1 1 1 1 1 1 0
c('Normale','Malade')[ tb$HeartDisease+1] %>% # tb$HeartDisease donne l'indice à aller chercher dans le vecteur ['Normale','Malade']
  as.factor
##   [1] Normale Malade  Normale Malade  Normale Normale Normale Normale Malade 
##  [10] Normale Normale Malade  Normale Malade  Normale Normale Malade  Normale
##  [19] Malade  Malade  Normale Normale Normale Malade  Normale Normale Normale
##  [28] Normale Normale Normale Malade  Normale Malade  Malade  Normale Normale
##  [37] Malade  Normale Normale Normale Normale Malade  Normale Normale Malade 
##  [46] Normale Normale Normale Normale Malade  Malade  Malade  Normale Normale
##  [55] Normale Normale Malade  Malade  Normale Malade  Normale Normale Normale
##  [64] Malade  Normale Normale Normale Normale Malade  Normale Malade  Normale
##  [73] Malade  Normale Malade  Normale Malade  Normale Normale Malade  Normale
##  [82] Normale Malade  Normale Malade  Malade  Malade  Normale Malade  Normale
##  [91] Normale Normale Normale Malade  Normale Malade  Normale Normale Normale
## [100] Normale Malade  Normale Malade  Malade  Malade  Normale Normale Normale
## [109] Normale Normale Normale Malade  Normale Normale Normale Malade  Malade 
## [118] Malade  Normale Malade  Malade  Normale Normale Malade  Normale Normale
## [127] Normale Normale Normale Normale Normale Malade  Malade  Malade  Normale
## [136] Malade  Normale Normale Malade  Malade  Malade  Malade  Malade  Normale
## [145] Malade  Normale Normale Normale Normale Malade  Normale Normale Normale
## [154] Normale Normale Malade  Malade  Normale Malade  Normale Malade  Malade 
## [163] Normale Normale Normale Malade  Malade  Normale Normale Normale Normale
## [172] Normale Normale Normale Malade  Malade  Malade  Normale Normale Normale
## [181] Malade  Normale Malade  Normale Normale Malade  Normale Malade  Normale
## [190] Malade  Normale Normale Normale Normale Normale Normale Normale Normale
## [199] Malade  Normale Normale Normale Normale Normale Normale Normale Normale
## [208] Malade  Normale Malade  Malade  Malade  Normale Normale Malade  Normale
## [217] Malade  Normale Normale Normale Malade  Malade  Normale Normale Normale
## [226] Malade  Normale Malade  Normale Normale Normale Normale Normale Normale
## [235] Normale Normale Malade  Malade  Malade  Malade  Normale Malade  Malade 
## [244] Normale Malade  Normale Malade  Malade  Malade  Malade  Malade  Malade 
## [253] Normale Normale Malade  Normale Normale Normale Normale Normale Normale
## [262] Normale Malade  Malade  Malade  Normale Malade  Normale Malade  Normale
## [271] Normale Normale Malade  Normale Normale Normale Malade  Malade  Normale
## [280] Normale Normale Malade  Normale Normale Normale Normale Normale Normale
## [289] Normale Normale Normale Normale Normale Malade  Malade  Malade  Malade 
## [298] Malade  Malade  Malade  Malade  Normale Malade  Malade  Malade  Malade 
## [307] Malade  Normale Malade  Malade  Normale Malade  Malade  Malade  Normale
## [316] Malade  Malade  Malade  Malade  Malade  Malade  Malade  Malade  Malade 
## [325] Malade  Malade  Normale Malade  Malade  Malade  Malade  Malade  Normale
## [334] Malade  Malade  Malade  Normale Malade  Malade  Malade  Malade  Malade 
## [343] Malade  Malade  Malade  Malade  Malade  Malade  Malade  Malade  Malade 
## [352] Malade  Malade  Malade  Malade  Malade  Malade  Malade  Malade  Malade 
## [361] Malade  Malade  Malade  Malade  Normale Malade  Malade  Malade  Malade 
## [370] Malade  Malade  Malade  Malade  Malade  Malade  Malade  Malade  Malade 
## [379] Malade  Malade  Malade  Malade  Malade  Malade  Malade  Malade  Malade 
## [388] Malade  Malade  Malade  Malade  Malade  Malade  Malade  Malade  Malade 
## [397] Malade  Malade  Malade  Malade  Malade  Malade  Malade  Malade  Malade 
## [406] Malade  Malade  Malade  Malade  Malade  Malade  Malade  Malade  Malade 
## [415] Malade  Malade  Malade  Normale Malade  Malade  Normale Normale Malade 
## [424] Normale Malade  Malade  Normale Malade  Malade  Malade  Malade  Normale
## [433] Malade  Malade  Normale Normale Malade  Malade  Malade  Normale Malade 
## [442] Malade  Malade  Malade  Malade  Malade  Malade  Malade  Malade  Malade 
## [451] Malade  Malade  Malade  Normale Malade  Normale Malade  Malade  Malade 
## [460] Normale Malade  Malade  Malade  Normale Malade  Normale Malade  Normale
## [469] Malade  Normale Malade  Malade  Malade  Malade  Normale Malade  Normale
## [478] Malade  Malade  Malade  Malade  Malade  Malade  Malade  Malade  Malade 
## [487] Normale Malade  Normale Malade  Malade  Malade  Malade  Malade  Malade 
## [496] Malade  Normale Malade  Malade  Malade  Malade  Malade  Malade  Normale
## [505] Malade  Malade  Malade  Normale Malade  Malade  Normale Malade  Normale
## [514] Malade  Malade  Normale Malade  Malade  Malade  Malade  Normale Malade 
## [523] Malade  Malade  Normale Normale Malade  Normale Malade  Malade  Malade 
## [532] Malade  Malade  Malade  Malade  Malade  Malade  Malade  Malade  Normale
## [541] Malade  Malade  Malade  Malade  Normale Normale Malade  Malade  Malade 
## [550] Normale Malade  Normale Malade  Malade  Normale Malade  Normale Malade 
## [559] Malade  Malade  Normale Normale Normale Malade  Malade  Malade  Normale
## [568] Malade  Malade  Malade  Malade  Malade  Malade  Malade  Malade  Malade 
## [577] Malade  Malade  Malade  Malade  Malade  Malade  Malade  Normale Malade 
## [586] Malade  Malade  Normale Malade  Malade  Normale Normale Malade  Malade 
## [595] Malade  Malade  Malade  Normale Malade  Malade  Normale Malade  Malade 
## [604] Malade  Normale Normale Malade  Malade  Malade  Malade  Malade  Normale
## [613] Malade  Normale Malade  Malade  Normale Malade  Normale Normale Normale
## [622] Malade  Malade  Malade  Malade  Normale Normale Normale Malade  Normale
## [631] Normale Malade  Malade  Normale Normale Malade  Normale Normale Normale
## [640] Normale Normale Normale Normale Malade  Normale Malade  Normale Normale
## [649] Malade  Malade  Malade  Malade  Malade  Normale Normale Malade  Normale
## [658] Normale Normale Malade  Normale Malade  Malade  Malade  Malade  Malade 
## [667] Normale Normale Normale Normale Normale Malade  Normale Malade  Malade 
## [676] Normale Malade  Normale Normale Normale Malade  Normale Malade  Normale
## [685] Malade  Malade  Normale Normale Normale Normale Malade  Normale Normale
## [694] Normale Normale Malade  Malade  Malade  Normale Normale Normale Normale
## [703] Normale Normale Malade  Normale Malade  Malade  Malade  Malade  Malade 
## [712] Normale Malade  Normale Normale Normale Malade  Normale Malade  Malade 
## [721] Malade  Normale Malade  Malade  Normale Malade  Normale Malade  Normale
## [730] Normale Normale Malade  Malade  Normale Malade  Malade  Malade  Malade 
## [739] Normale Normale Normale Malade  Normale Normale Malade  Malade  Malade 
## [748] Normale Malade  Normale Normale Normale Malade  Normale Normale Malade 
## [757] Normale Malade  Normale Malade  Malade  Malade  Malade  Malade  Normale
## [766] Normale Normale Normale Normale Normale Normale Malade  Normale Normale
## [775] Malade  Malade  Malade  Normale Malade  Normale Normale Normale Normale
## [784] Normale Malade  Normale Malade  Malade  Normale Normale Malade  Malade 
## [793] Malade  Malade  Normale Normale Malade  Malade  Normale Normale Normale
## [802] Malade  Normale Normale Malade  Normale Malade  Normale Malade  Normale
## [811] Normale Normale Normale Normale Malade  Normale Malade  Malade  Malade 
## [820] Malade  Normale Normale Normale Malade  Normale Malade  Normale Normale
## [829] Malade  Normale Normale Normale Normale Normale Normale Malade  Malade 
## [838] Normale Malade  Normale Normale Malade  Malade  Normale Normale Malade 
## [847] Malade  Normale Malade  Normale Malade  Normale Malade  Normale Normale
## [856] Malade  Normale Normale Malade  Normale Malade  Malade  Normale Malade 
## [865] Malade  Malade  Normale Malade  Normale Normale Normale Normale Malade 
## [874] Malade  Normale Normale Malade  Malade  Normale Malade  Normale Normale
## [883] Normale Normale Malade  Normale Normale Malade  Malade  Malade  Normale
## [892] Normale Normale Malade  Normale Malade  Normale Malade  Normale Malade 
## [901] Malade  Malade  Normale Normale Normale Malade  Normale Malade  Malade 
## [910] Malade  Normale Malade  Malade  Malade  Malade  Malade  Malade  Normale
## Levels: Malade Normale

Modification du tibble : fonction mutate

tb=  tb %>% 
    mutate(HeartDisease=as.factor( c('Normale','Malade'))[ HeartDisease +1] ) 

tb$HeartDisease=as.factor(tb$HeartDisease)

Ceci permet de remplacer les factors 0,1 par normale et malade

sum(is.na(data))
## Warning in is.na(data): is.na() applied to non-(list or vector) of type
## 'closure'
## [1] 0

Les valeurs numeriques

numerical_columns <- c("Age", "RestingBP", "Cholesterol", "MaxHR", "Oldpeak", "HeartDisease")
tb_num= tb[numerical_columns]
tb_num
## # A tibble: 918 × 6
##      Age RestingBP Cholesterol MaxHR Oldpeak HeartDisease
##    <dbl>     <dbl>       <dbl> <dbl>   <dbl> <fct>       
##  1    40       140         289   172     0   Normale     
##  2    49       160         180   156     1   Malade      
##  3    37       130         283    98     0   Normale     
##  4    48       138         214   108     1.5 Malade      
##  5    54       150         195   122     0   Normale     
##  6    39       120         339   170     0   Normale     
##  7    45       130         237   170     0   Normale     
##  8    54       110         208   142     0   Normale     
##  9    37       140         207   130     1.5 Malade      
## 10    48       120         284   120     0   Normale     
## # ℹ 908 more rows

En x les classes, et pour chaque classe, les effectifs, sous forme de barplot :

tb %>%
  ggplot(aes(x=HeartDisease, fill=HeartDisease)) + # aes pour aesthetic
  geom_bar(stat = 'count') + # https://ggplot2.tidyverse.org/reference/geom_bar.html
  geom_text(stat='count', aes(label=..count..), vjust=-0.5) # pour afficher les effectifs
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Transformation : pivot_longer de tidyr (plusieurs variables → (variable name, variable value) )

Manip pour avoir des tuples (classe, variable, value)

tbg = tb_num %>% 
  rowid_to_column() %>% # ajout d'un identifiant de ligne 
  pivot_longer(Age:Oldpeak, names_to='variable', values_to='value')
tbg
## # A tibble: 4,590 × 4
##    rowid HeartDisease variable    value
##    <int> <fct>        <chr>       <dbl>
##  1     1 Normale      Age            40
##  2     1 Normale      RestingBP     140
##  3     1 Normale      Cholesterol   289
##  4     1 Normale      MaxHR         172
##  5     1 Normale      Oldpeak         0
##  6     2 Malade       Age            49
##  7     2 Malade       RestingBP     160
##  8     2 Malade       Cholesterol   180
##  9     2 Malade       MaxHR         156
## 10     2 Malade       Oldpeak         1
## # ℹ 4,580 more rows

rowid sera nécessaire si on veut revenir au format de départ sinon impossible de savoir quelle mesures (d’alcool, de ash, …) il faut rassembler pour reconstruire le tableau de départ.

tbg %>%
  ggplot(aes(HeartDisease, value, color=HeartDisease)) +
  geom_boxplot() +
  facet_wrap(~ variable)

Conclusion : On voit bien que Age et Oldpeak sont écrasé par les autres Variables quantitatives. Il est donc pertinent de normaliser

Plot interactif avec plotly (on sauvegarde le résultat de ggplot et on le passe à ggplotly)

p = tbg %>%
  ggplot(aes(HeartDisease, value, color=HeartDisease)) +
  geom_boxplot() +
  facet_wrap(~ variable)
ggplotly(p)

On enregistre les paramètres de normalisation (z-score) de chacune des variables :

znorm = tbg %>% 
  group_by(variable) %>%
  summarize(mean=mean(value), sd=sd(value), min = min(value), max=max(value), median=median(value))
znorm
## # A tibble: 5 × 6
##   variable       mean     sd   min   max median
##   <chr>         <dbl>  <dbl> <dbl> <dbl>  <dbl>
## 1 Age          53.5     9.43  28    77     54  
## 2 Cholesterol 199.    109.     0   603    223  
## 3 MaxHR       137.     25.5   60   202    138  
## 4 Oldpeak       0.887   1.07  -2.6   6.2    0.6
## 5 RestingBP   132.     18.5    0   200    130

jointure et ajout des z-scores

tbg %<>% 
  inner_join(znorm, by='variable') %>% 
  mutate(value.z = (value-mean)/sd)
tbg
## # A tibble: 4,590 × 10
##    rowid HeartDisease variable   value    mean     sd   min   max median value.z
##    <int> <fct>        <chr>      <dbl>   <dbl>  <dbl> <dbl> <dbl>  <dbl>   <dbl>
##  1     1 Normale      Age           40  53.5     9.43  28    77     54    -1.43 
##  2     1 Normale      RestingBP    140 132.     18.5    0   200    130     0.411
##  3     1 Normale      Cholester…   289 199.    109.     0   603    223     0.825
##  4     1 Normale      MaxHR        172 137.     25.5   60   202    138     1.38 
##  5     1 Normale      Oldpeak        0   0.887   1.07  -2.6   6.2    0.6  -0.832
##  6     2 Malade       Age           49  53.5     9.43  28    77     54    -0.478
##  7     2 Malade       RestingBP    160 132.     18.5    0   200    130     1.49 
##  8     2 Malade       Cholester…   180 199.    109.     0   603    223    -0.172
##  9     2 Malade       MaxHR        156 137.     25.5   60   202    138     0.754
## 10     2 Malade       Oldpeak        1   0.887   1.07  -2.6   6.2    0.6   0.106
## # ℹ 4,580 more rows

Vérification de la normalisation (moyenne à 0 et écart-type à 1)

tbg %>% 
  group_by(variable) %>% 
  summarize(moyenne=round(mean(value.z), 4), `écart-type`=sd(value.z)) # nom de colonne avec un é pour illustrer mais à éviter en général
## # A tibble: 5 × 3
##   variable    moyenne `écart-type`
##   <chr>         <dbl>        <dbl>
## 1 Age               0            1
## 2 Cholesterol       0            1
## 3 MaxHR             0            1
## 4 Oldpeak           0            1
## 5 RestingBP         0            1

Visualisation des distributions avec ggplot + facet

Visualisation

tbg %>%
  ggplot(aes(HeartDisease, value.z, color=HeartDisease)) +
  geom_violin() +
  geom_jitter(alpha=.3, width=.15, size=0.5) +
  facet_wrap(~ variable)

Conclusion : Maintenant les variables quantitatives sont comparables.

# Normalisation des variables numériques avec scale
tb_numpur=tb[numerical_columns] %>% 
  select(-HeartDisease)
tb_numpur <- scale(tb_numpur)
colSums(tb_numpur)
##           Age     RestingBP   Cholesterol         MaxHR       Oldpeak 
## -1.107239e-13  1.838182e-13  1.890328e-14  4.727919e-13 -4.220582e-15
# Calcul de la matrice de corrélation
correlation_matrix <- cor(tb_numpur)

# Affichage de la matrice de corrélation
print(correlation_matrix)
##                     Age  RestingBP Cholesterol      MaxHR     Oldpeak
## Age          1.00000000  0.2543994 -0.09528177 -0.3820447  0.25861154
## RestingBP    0.25439936  1.0000000  0.10089294 -0.1121350  0.16480304
## Cholesterol -0.09528177  0.1008929  1.00000000  0.2357924  0.05014811
## MaxHR       -0.38204468 -0.1121350  0.23579240  1.0000000 -0.16069055
## Oldpeak      0.25861154  0.1648030  0.05014811 -0.1606906  1.00000000
# Visualisation de la corrélation
corrplot(correlation_matrix, method = "circle")

Conclusion : Si deux variables sont fortement corrélées (|corr| > 0.8), l’une peut être supprimée car elles apportent la même information. Mais ici, toutes les corrélations sont faibles, donc aucune variable n’est totalement redondante et donc on garde nos toutes belles variables hhh.

# Tester l'indépendance avec la variable cible (Cardiopathie) pour chaque variable catégorielle
tb$FastingBS= as.factor(tb$FastingBS)
categorical_columns <- c("Sex", "ChestPainType", "RestingECG", "ExerciseAngina", "ST_Slope", "FastingBS")
for (col in categorical_columns) {
  #cat("\nTest du chi-carré pour", col, ":\n")
  test_result <- chisq.test(table(tb[[col]], tb$HeartDisease))
  print(test_result)
}
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(tb[[col]], tb$HeartDisease)
## X-squared = 84.145, df = 1, p-value < 2.2e-16
## 
## 
##  Pearson's Chi-squared test
## 
## data:  table(tb[[col]], tb$HeartDisease)
## X-squared = 268.07, df = 3, p-value < 2.2e-16
## 
## 
##  Pearson's Chi-squared test
## 
## data:  table(tb[[col]], tb$HeartDisease)
## X-squared = 10.931, df = 2, p-value = 0.004229
## 
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(tb[[col]], tb$HeartDisease)
## X-squared = 222.26, df = 1, p-value < 2.2e-16
## 
## 
##  Pearson's Chi-squared test
## 
## data:  table(tb[[col]], tb$HeartDisease)
## X-squared = 355.92, df = 2, p-value < 2.2e-16
## 
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(tb[[col]], tb$HeartDisease)
## X-squared = 64.321, df = 1, p-value = 1.057e-15
# La méthode de prof
tb_cat= tb[categorical_columns]
x= tb[categorical_columns]
y=tb$HeartDisease
sapply(colnames(x), function(a)chisq.test(x=x[,a], y=y)$p.value)
##            Sex  ChestPainType     RestingECG ExerciseAngina       ST_Slope 
##   4.597617e-20   8.083728e-58   4.229233e-03   2.907808e-50   5.167638e-78 
##      FastingBS 
##   1.057302e-15

Conclusion : p-value < 0.05 : Il y a une relation significative entre toutes les variables catégorielles et HeartDisease. Toutes les variables catégorielles sont informatives et doivent être conservées.

Liaison attribut-classe

Table de contingence

table(tb$HeartDisease, tb_cat$ChestPainType)
##          
##           ASY ATA NAP  TA
##   Malade  392  24  72  20
##   Normale 104 149 131  26

Visualisation graphique

 plot(table(tb_cat$ChestPainType, tb$HeartDisease), main='ChestPainType')

tb %>%
  ggbivariate(outcome="HeartDisease", explanatory= colnames(tb[categorical_columns]))

# Affichage
foo=sapply(colnames(x), 
           function(a)
             plot(
               table(unlist(x[,a]), y),
               main=a
             
             )
)

Conclusion générale : Toutes les variables sont pertinentes pour la suite de l’analyse.

colonne_numerique <- c("Age", "RestingBP", "Cholesterol", "MaxHR", "Oldpeak")
tb_final=tb
tb_final[colonne_numerique]= scale(tb_final[colonne_numerique])
tb_final
## # A tibble: 918 × 12
##        Age Sex   ChestPainType RestingBP Cholesterol FastingBS RestingECG  MaxHR
##      <dbl> <fct> <fct>             <dbl>       <dbl> <fct>     <fct>       <dbl>
##  1 -1.43   M     ATA               0.411      0.825  0         Normal      1.38 
##  2 -0.478  F     NAP               1.49      -0.172  0         Normal      0.754
##  3 -1.75   M     ATA              -0.129      0.770  0         ST         -1.52 
##  4 -0.584  F     ASY               0.303      0.139  0         Normal     -1.13 
##  5  0.0519 M     NAP               0.951     -0.0347 0         Normal     -0.582
##  6 -1.54   M     NAP              -0.670      1.28   0         Normal      1.30 
##  7 -0.902  F     ATA              -0.129      0.349  0         Normal      1.30 
##  8  0.0519 M     ATA              -1.21       0.0841 0         Normal      0.204
##  9 -1.75   M     ASY               0.411      0.0750 0         Normal     -0.267
## 10 -0.584  F     ATA              -0.670      0.779  0         Normal     -0.660
## # ℹ 908 more rows
## # ℹ 4 more variables: ExerciseAngina <fct>, Oldpeak <dbl>, ST_Slope <fct>,
## #   HeartDisease <fct>

Séparation des données en ensemble d’entraînement et de test

set.seed(42)  # Pour garantir la reproductibilité
trainIndex <- createDataPartition(tb_final$HeartDisease, p = 2/3, list = FALSE)
trainData <- tb[trainIndex, ]
testData <- tb[-trainIndex, ]

# Vérification des dimensions des ensembles
dim(trainData)
## [1] 613  12
dim(testData)
## [1] 305  12

Foret aleatoire

rf_model <- randomForest(HeartDisease ~ ., ntree = 50000, data = trainData, importance = TRUE) # Entraîner un modèle random forest sur l'ensemble d'entraînement


predictions <- predict(rf_model, newdata = testData) # Prédictions
confusionMatrix(predictions, testData$HeartDisease) # Matrice de confusion
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Malade Normale
##    Malade     155      26
##    Normale     14     110
##                                           
##                Accuracy : 0.8689          
##                  95% CI : (0.8257, 0.9046)
##     No Information Rate : 0.5541          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.7323          
##                                           
##  Mcnemar's Test P-Value : 0.08199         
##                                           
##             Sensitivity : 0.9172          
##             Specificity : 0.8088          
##          Pos Pred Value : 0.8564          
##          Neg Pred Value : 0.8871          
##              Prevalence : 0.5541          
##          Detection Rate : 0.5082          
##    Detection Prevalence : 0.5934          
##       Balanced Accuracy : 0.8630          
##                                           
##        'Positive' Class : Malade          
## 
table(predictions,testData$HeartDisease)
##            
## predictions Malade Normale
##     Malade     155      26
##     Normale     14     110
importance(rf_model) # afficher l'importance des variables
##                   Malade   Normale MeanDecreaseAccuracy MeanDecreaseGini
## Age             32.02809  76.86594             77.94278        22.901831
## Sex            138.78582 142.99520            190.51302        10.557129
## ChestPainType  228.96091 242.17487            317.95004        41.499691
## RestingBP       63.79699  10.34377             56.12416        22.173653
## Cholesterol     97.68881 129.15609            154.38563        31.523692
## FastingBS      116.72690  96.60586            146.15849         7.512203
## RestingECG      40.55291  12.46801             39.32065         7.868187
## MaxHR          138.99369  11.65900            128.30665        29.339153
## ExerciseAngina 122.64457 164.80044            200.43852        25.493103
## Oldpeak         71.14422 230.03367            221.77873        31.061201
## ST_Slope       336.02894 531.89618            582.27555        68.487545
varImpPlot(rf_model, main = "Importance des variables") # visualiser l'importance des variables

Naive Bayesien

Entraînement sans lissage Laplace = 0

nb_model <- naiveBayes(HeartDisease ~ ., data = trainData)
nb_predictions <- predict(nb_model, newdata = testData)
confusionMatrix(nb_predictions, testData$HeartDisease)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Malade Normale
##    Malade     155      23
##    Normale     14     113
##                                           
##                Accuracy : 0.8787          
##                  95% CI : (0.8367, 0.9131)
##     No Information Rate : 0.5541          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7529          
##                                           
##  Mcnemar's Test P-Value : 0.1884          
##                                           
##             Sensitivity : 0.9172          
##             Specificity : 0.8309          
##          Pos Pred Value : 0.8708          
##          Neg Pred Value : 0.8898          
##              Prevalence : 0.5541          
##          Detection Rate : 0.5082          
##    Detection Prevalence : 0.5836          
##       Balanced Accuracy : 0.8740          
##                                           
##        'Positive' Class : Malade          
## 
print(nb_model)
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##    Malade   Normale 
## 0.5530179 0.4469821 
## 
## Conditional probabilities:
##          Age
## Y             [,1]     [,2]
##   Malade  55.63717 8.874598
##   Normale 50.55474 9.661994
## 
##          Sex
## Y                 F         M
##   Malade  0.1091445 0.8908555
##   Normale 0.3357664 0.6642336
## 
##          ChestPainType
## Y                ASY        ATA        NAP         TA
##   Malade  0.77581121 0.04129794 0.13274336 0.05014749
##   Normale 0.25182482 0.35036496 0.34671533 0.05109489
## 
##          RestingBP
## Y             [,1]     [,2]
##   Malade  132.8584 19.63869
##   Normale 130.0073 16.50064
## 
##          Cholesterol
## Y             [,1]     [,2]
##   Malade  174.6755 127.0620
##   Normale 225.9599  75.7364
## 
##          FastingBS
## Y                 0         1
##   Malade  0.6578171 0.3421829
##   Normale 0.8905109 0.1094891
## 
##          RestingECG
## Y               LVH    Normal        ST
##   Malade  0.2153392 0.5575221 0.2271386
##   Normale 0.1861314 0.6569343 0.1569343
## 
##          MaxHR
## Y             [,1]     [,2]
##   Malade  129.0413 22.75802
##   Normale 147.0000 22.74722
## 
##          ExerciseAngina
## Y                 N         Y
##   Malade  0.3834808 0.6165192
##   Normale 0.8722628 0.1277372
## 
##          Oldpeak
## Y              [,1]      [,2]
##   Malade  1.2351032 1.1172826
##   Normale 0.4233577 0.7420655
## 
##          ST_Slope
## Y               Down       Flat         Up
##   Malade  0.10619469 0.73156342 0.16224189
##   Normale 0.02189781 0.19343066 0.78467153

Réentraînement avec lissage de Laplace =1 pour éviter proba de 0

model_nb_laplace <- naiveBayes(HeartDisease ~ ., data = trainData, laplace = 1)

pred_nb_laplace <- predict(model_nb_laplace, testData)

confusionMatrix(pred_nb_laplace, testData$HeartDisease, positive = "Malade")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Malade Normale
##    Malade     156      23
##    Normale     13     113
##                                           
##                Accuracy : 0.882           
##                  95% CI : (0.8404, 0.9159)
##     No Information Rate : 0.5541          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7594          
##                                           
##  Mcnemar's Test P-Value : 0.1336          
##                                           
##             Sensitivity : 0.9231          
##             Specificity : 0.8309          
##          Pos Pred Value : 0.8715          
##          Neg Pred Value : 0.8968          
##              Prevalence : 0.5541          
##          Detection Rate : 0.5115          
##    Detection Prevalence : 0.5869          
##       Balanced Accuracy : 0.8770          
##                                           
##        'Positive' Class : Malade          
## 

Arbre de decision

division Gini

modele_tree <- rpart(HeartDisease ~ ., data = trainData, method = "class", parms = list(split = "gini"))

pred_tree <- predict(modele_tree, newdata = testData, type = "class") # Prédiction sur les données de test

mat_tree <- confusionMatrix(pred_tree, testData$HeartDisease, positive = "Normale")

print(mat_tree)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Malade Normale
##    Malade     153      32
##    Normale     16     104
##                                           
##                Accuracy : 0.8426          
##                  95% CI : (0.7968, 0.8816)
##     No Information Rate : 0.5541          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.6778          
##                                           
##  Mcnemar's Test P-Value : 0.03038         
##                                           
##             Sensitivity : 0.7647          
##             Specificity : 0.9053          
##          Pos Pred Value : 0.8667          
##          Neg Pred Value : 0.8270          
##              Prevalence : 0.4459          
##          Detection Rate : 0.3410          
##    Detection Prevalence : 0.3934          
##       Balanced Accuracy : 0.8350          
##                                           
##        'Positive' Class : Normale         
## 
rpart.plot(modele_tree, type = 2, extra = 104, fallen.leaves = TRUE)

division Entropie

tree_entropy <- rpart(HeartDisease ~ ., data = trainData,
                      method = "class",
                      parms = list(split = "information"),
                      control = rpart.control(cp = 0.01, maxdepth = 5))

pred_tree_entropy <- predict(tree_entropy, testData, type = "class")
confusionMatrix(pred_tree_entropy, testData$HeartDisease, positive = "Malade")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Malade Normale
##    Malade     152      33
##    Normale     17     103
##                                           
##                Accuracy : 0.8361          
##                  95% CI : (0.7896, 0.8758)
##     No Information Rate : 0.5541          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.6644          
##                                           
##  Mcnemar's Test P-Value : 0.03389         
##                                           
##             Sensitivity : 0.8994          
##             Specificity : 0.7574          
##          Pos Pred Value : 0.8216          
##          Neg Pred Value : 0.8583          
##              Prevalence : 0.5541          
##          Detection Rate : 0.4984          
##    Detection Prevalence : 0.6066          
##       Balanced Accuracy : 0.8284          
##                                           
##        'Positive' Class : Malade          
## 

Tableau des confusions avec ggplot2

mat_conf <- matrix(c(156, 30, 16, 104),
                   nrow = 2, byrow = TRUE,
                   dimnames = list("Classe réelle" = c("Malade", "Normale"),
                                   "Classe prédite" = c("Malade", "Normale")))


df_long <- melt(mat_conf)
colnames(df_long) <- c("ClasseReelle", "ClassePredite", "Valeur")


df_long$ClasseReelle <- factor(df_long$ClasseReelle, levels = rev(c("Malade", "Normale")))


df_long$ClassePredite <- factor(df_long$ClassePredite, levels = c("Malade", "Normale"))

ggplot(df_long, aes(x = ClassePredite, y = ClasseReelle)) +
  geom_tile(aes(fill = Valeur), color = "#f0f0f0", linewidth = 1.2) +
  geom_text(aes(label = Valeur), color = "black", size = 6, fontface = "bold") +
  scale_fill_gradient2(low = "#e0f7fa", mid = "#80deea", high = "#006064", midpoint = 100, guide = "none") +
  labs(
    title = "Matrice de confusion du modèle Arbre de Décision",
    x = "Classe Prédite",
    y = "Classe Réelle"
  ) +
  theme_minimal(base_family = "Arial", base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 15, face = "bold", color = "#006064"),
    axis.text = element_text(face = "bold"),
    panel.grid = element_blank()
  )

Tableau de comparaison des modeles

# Les données
data <- tibble::tibble(
  Modèles = c("Random Forest", "Naïve Bayes", "K-Nearest Neighbors", "Arbre de Décision"),
  `Précision globale (Accuracy)` = c("86,6 %", "88,56 %", "87,02 %", "84,97 %"),
  Sensibilité = c("89,53 %", "88,95 %", "87,22 %", "77,61 %"),
  `Indice Kappa` = c("0,7267", "0,7682", "0,7355", "0,6778"),
  `Balanced Accuracy` = c("86,19 %", "88,51 %", "86,96 %", "84,15 %")
)

# Création du tableau flextable
flextable(data) %>%
  set_header_labels(
    Modèles = "Modèles",
    `Précision globale (Accuracy)` = "Précision globale (Accuracy)",
    Sensibilité = "Sensibilité",
    `Indice Kappa` = "Indice Kappa",
    `Balanced Accuracy` = "Balanced Accuracy"
  ) %>%
  bold(part = "header") %>%
  fontsize(size = 12, part = "all") %>%
  color(color = "black") %>%
  bg(part = "header", bg = "#DDEBF7") %>%
  autofit()

Modèles

Précision globale (Accuracy)

Sensibilité

Indice Kappa

Balanced Accuracy

Random Forest

86,6 %

89,53 %

0,7267

86,19 %

Naïve Bayes

88,56 %

88,95 %

0,7682

88,51 %

K-Nearest Neighbors

87,02 %

87,22 %

0,7355

86,96 %

Arbre de Décision

84,97 %

77,61 %

0,6778

84,15 %